library(randomForest)
library(MASS)
library(RLT)

trainN <- 200
testN  <- 1000
P      <- 100

########### Scenario 3 in the RLT paper ###################

# AR(1)-type covariance structure: Sigma_ij = rho^|i - j|
Di  <- 1:P
rho <- 0.5
H   <- abs(outer(Di, Di, "-"))
V   <- rho^H

dataX <- as.matrix(mvrnorm(trainN + testN, mu = rep(0, P), Sigma = V))
colnames(dataX) <- paste("X", 1:P, sep = "")

# linear link (unused; overridden by the interaction link on the next line)
# Link <- 5*dataX[, 10] + 5*dataX[, 30] + 10*dataX[, 50]
Link <- 5*dataX[, 10]*dataX[, 30]
Y <- rnorm(trainN + testN, mean = Link, sd = 1)

traindata <- data.frame(as.matrix(cbind(dataX, Y))[1:trainN, ])
testdata  <- data.frame(as.matrix(cbind(dataX, Y))[(trainN + 1):(trainN + testN), ])

################### fit RLT model ################

################### default setting, no reinforcement learning (on par with randomForest)
RLT.fit <- RLT(Y ~ ., traindata = traindata, model = "regression")
RLT.predict <- predict(RLT.fit, testdata)
var(RLT.predict - testdata$Y)  # variance of the prediction error
barplot(RLT.fit$VarImportance)

## with reinforcement learning and parallel computing
## (this setting does not produce a variable importance measure)
RLT.fit <- RLT(Y ~ ., traindata = traindata, model = "regression",
               ntrees = 50, nmin = floor(trainN^(1/3)), use_cores = 4,
               split_gen = "Rank", resample = FALSE, reinforcement = TRUE,
               muting = 0, muting_percent = 0.4,
               protectVar = sqrt(ncol(traindata)),
               combsplit_th = 0.5, combsplit = 1)
RLT.predict <- predict(RLT.fit, testdata)
var(RLT.predict - testdata$Y)

## with reinforcement learning and very heavy tuning;
## setting protectVar = 2 can sometimes give better results
RLT.fit <- RLT(Y ~ ., traindata = traindata, model = "regression",
               ntrees = 50, nmin = floor(trainN^(1/3)), use_cores = 4,
               split_gen = "Rank", resample = FALSE, reinforcement = TRUE,
               muting = -1, muting_percent = 0.6, protectVar = sqrt(P),
               combsplit_th = 0.3, combsplit = 2)
RLT.predict <- predict(RLT.fit, testdata)
var(RLT.predict - testdata$Y)

## with reinforcement learning, using resampling to calculate variable
## importance, which sacrifices some accuracy
RLT.fit <- RLT(Y ~ ., traindata = traindata, model = "regression",
               ntrees = 50, nmin = floor(trainN^(1/3)), use_cores = 4,
               split_gen = "Rank", resample = TRUE, resample_prob = 0.85,
               reinforcement = TRUE, muting = -1, muting_percent = 0.6,
               protectVar = sqrt(P), combsplit_th = 0.3, combsplit = 2)
RLT.predict <- predict(RLT.fit, testdata)
var(RLT.predict - testdata$Y)
barplot(RLT.fit$VarImportance)

## change the embedded model's number of trees and sampling probability
RLT.fit <- RLT(Y ~ ., traindata = traindata, model = "regression",
               ntrees = 50, nmin = floor(trainN^(1/3)), use_cores = 4,
               split_gen = "Rank", resample = TRUE, resample_prob = 0.85,
               reinforcement = TRUE, muting = -1, muting_percent = 0.6,
               protectVar = sqrt(P), combsplit_th = 0.3, combsplit = 2,
               ntree_embed = 50, resample_prob_embed = 0.85)
RLT.predict <- predict(RLT.fit, testdata)
var(RLT.predict - testdata$Y)
barplot(RLT.fit$VarImportance)

################ to compare with random forests ###############

RF <- randomForest(Y ~ ., data = traindata, ntree = 500)
PRF <- predict(RF, newdata = testdata)
var(testdata$Y - PRF)
barplot(t(RF$importance))
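
################ optional helper (sketch) ###############

## A small convenience sketch for comparing the settings above: fit, predict
## on the test set, and return the variance of the prediction error in one
## call. This helper is not part of the RLT package; the function name and
## default argument are illustrative assumptions.
testError <- function(fit, newdata = testdata) {
  pred <- predict(fit, newdata)
  var(pred - newdata$Y)
}

## usage: testError(RLT.fit); testError(RF) also works, since randomForest
## regression objects share the same predict() interface.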
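
## Note: the data are simulated, so the error variances above differ from run
## to run. For a reproducible comparison, a seed can be fixed before the
## mvrnorm() and rnorm() calls at the top of the script (the seed value here
## is arbitrary):
# set.seed(1)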